import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt


df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


df.head()


df.shape

(1460, 81)


# Check null value
df_na = df.isna().sum()
df_na[df_na > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


# ExterQual
mapping = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
df['ExterQual'] = df['ExterQual'].map(mapping)
df['ExterCond'] = df['ExterCond'].map(mapping)
df['HeatingQC'] = df['HeatingQC'].map(mapping)
df['KitchenQual'] = df['KitchenQual'].map(mapping)
df['BsmtQual'] = df['BsmtQual'].map(mapping)
df['BsmtCond'] = df['BsmtCond'].map(mapping)
df['FireplaceQu'] = df['FireplaceQu'].map(mapping)
df['PoolQC'] = df['PoolQC'].map(mapping)


mapping_2 = {'Gd': 3,'Av': 2, 'Mn': 1}
df['BsmtExposure'] = df['BsmtExposure'].map(mapping_2)

mapping_3 = {'GLQ': 6,'ALQ': 5, 'BLQ': 4,'Rec': 3,'LwQ': 2, 'Unf': 1}
df['BsmtFinType1'] = df['BsmtFinType1'].map(mapping_3)
df['BsmtFinType2'] = df['BsmtFinType2'].map(mapping_3)

mapping_4 = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
df['GarageQual'] = df['GarageQual'].map(mapping_4)
df['GarageCond'] = df['GarageCond'].map(mapping_4)

mapping_5 = {'N': 0, 'Y': 1}
df['CentralAir'] = df['CentralAir'].map(mapping_5)

mapping_6 = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
df['LotShape'] = df['LotShape'].map(mapping_6)

mapping_7 = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
df['LandSlope'] = df['LandSlope'].map(mapping_7)


df.shape

(1460, 81)


sns.boxplot(x=df['LotFrontage'])
plt.title('LotFrontage (with outliers)')
plt.show()


df['LotFrontage'].fillna(df['LotFrontage'].mean(), inplace =True)


df.Alley.describe

<bound method NDFrame.describe of 0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1455    NaN
1456    NaN
1457    NaN
1458    NaN
1459    NaN
Name: Alley, Length: 1460, dtype: object>


df['Alley'].fillna('NA', inplace =True)


df.MasVnrType

0       BrkFace
1          None
2       BrkFace
3          None
4       BrkFace
         ...   
1455       None
1456      Stone
1457       None
1458       None
1459       None
Name: MasVnrType, Length: 1460, dtype: object


df['MasVnrType'].fillna('None', inplace =True)


df['MasVnrArea'].fillna(0, inplace =True)


df['BsmtQual'].fillna(df['BsmtQual'].mean(), inplace =True)


df['BsmtCond'].fillna(df['BsmtCond'].mean(), inplace =True)


df['BsmtExposure'].fillna(df['BsmtExposure'].mean(), inplace =True)


df['BsmtFinType1'].fillna(df['BsmtFinType1'].mean(), inplace =True)


df['BsmtFinType2'].fillna(df['BsmtFinType2'].mean(), inplace =True)


sns.displot(df['Electrical'])

<seaborn.axisgrid.FacetGrid at 0x127f27890>


df['Electrical'].fillna('None', inplace =True)


df.FireplaceQu.head()

0    NaN
1    3.0
2    3.0
3    4.0
4    3.0
Name: FireplaceQu, dtype: float64


df['FireplaceQu'].fillna(df['FireplaceQu'].mean(), inplace =True)


df['GarageType'].fillna('None', inplace =True)


df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean(), inplace =True)


df['GarageFinish'].fillna('None', inplace =True)


df['GarageQual'].fillna('None', inplace =True)


df['GarageCond'].fillna('None', inplace =True)


df['PoolQC'].fillna(df['PoolQC'].mean(), inplace =True)


df['Fence'].fillna('None', inplace =True)


df['MiscFeature'].fillna('None', inplace =True)


df_na = df.isna().sum()
df_na[df_na > 0]

Series([], dtype: int64)


df_clean = df


df_clean['GarageYrBlt'].dtype
df_clean['GarageYrBlt'] = 2023 - df_clean['GarageYrBlt']
df_clean['YearBuilt'] = 2023 - df_clean['YearBuilt']
df_clean['YearRemodAdd'] = 2023 - df_clean['YearRemodAdd']
df_clean['YrSold'] = 2023 - df_clean['YrSold']


df_clean['YearBuilt'].head()

0     20
1     47
2     22
3    108
4     23
Name: YearBuilt, dtype: int64


df_clean.rename(columns={'GarageYrBlt': 'GarageAge'}, inplace=True)
df_clean.rename(columns={'YearBuilt': 'YearBuiltAge'}, inplace=True)
df_clean.rename(columns={'YearRemodAdd': 'YearRemodAddAge'}, inplace=True)
df_clean.rename(columns={'YrSold': 'YrSoldAge'}, inplace=True)


#df_clean['Log_GarageAge'] = np.log(df_clean['GarageAge'])


#df_clean['Log_YearBuiltAge'] = np.log(df_clean['YearBuiltAge'])


#df_clean['Log_YearRemodAddAge'] = np.log(df_clean['YearRemodAddAge'])


#df_clean['Log_YrSoldAge'] = np.log(df_clean['YrSoldAge'])


#df_clean.drop(columns=['GarageAge', 'YearBuiltAge', 'YearRemodAddAge', 'YrSoldAge'])


df_clean['SalePrice'] = np.log(df_clean['SalePrice'])


df_clean.shape

(1460, 81)


numerical_df_clean = df_clean.select_dtypes(include='number')


numerical_df_clean.shape

(1460, 52)


numerical_df_clean = numerical_df_clean.drop(columns=['Id', 'SalePrice'], axis=1)


numerical_df_clean.shape

(1460, 50)


numerical_df_clean.head()


# Scale the data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()


scaled_numerical_df_clean = scale.fit_transform(numerical_df_clean)


scaled_numerical_df_clean

array([[ 0.07337496, -0.22937175, -0.20714171, ..., -0.08768781,
        -1.5991111 , -0.13877749],
       [-0.87256276,  0.4519361 , -0.09188637, ..., -0.08768781,
        -0.48911005,  0.61443862],
       [ 0.07337496, -0.09311018,  0.07347998, ..., -0.08768781,
         0.99089135, -0.13877749],
       ...,
       [ 0.30985939, -0.18395123, -0.14781027, ...,  4.95311151,
        -0.48911005, -1.64520971],
       [-0.87256276, -0.09311018, -0.08016039, ..., -0.08768781,
        -0.8591104 , -1.64520971],
       [-0.87256276,  0.22483348, -0.05811155, ..., -0.08768781,
        -0.1191097 , -0.13877749]])


from sklearn.decomposition import PCA


# Make an instance of the Model
pca = PCA(.95)
pca.fit(scaled_numerical_df_clean)

# Find out how many components PCA has after fitting the model
pca.n_components_

37


# Transform the data into the principal components
principal_components = pca.transform(scaled_numerical_df_clean)


principal_components.shape

(1460, 37)


principal_components

array([[ 2.18984101, -0.16763339, -1.47112319, ...,  0.10860845,
         0.07967898,  0.18651098],
       [ 0.04584576, -1.54323387,  1.0932354 , ..., -0.52074135,
         0.980782  ,  0.05223869],
       [ 2.32696368,  0.04969061, -1.34140998, ...,  0.04947632,
        -0.38301589, -0.09240722],
       ...,
       [ 1.40977495,  2.26467125,  0.85626749, ...,  1.17748467,
        -0.66432792, -0.89253948],
       [-2.71136048, -2.80817005,  1.70334363, ...,  0.44461475,
         0.1831076 , -0.55143658],
       [-0.94569008, -1.84532397,  1.62666513, ...,  1.09212882,
         0.45154769, -0.20866037]])


# Create a DataFrame from the principal components
columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
principal_df = pd.DataFrame(principal_components, columns=columns)


principal_df.head()


categorical_df_clean = df_clean.select_dtypes(exclude='number')


categorical_df_clean.shape

(1460, 29)


categorical_df_clean.head()


# List to conduct one hot-encode
list_to_encode = categorical_df_clean.columns.to_list()
list_to_encode

['MSZoning',
 'Street',
 'Alley',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'Heating',
 'Electrical',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']


# Turn categorical data into dummy
categorical_df = pd.get_dummies(categorical_df_clean, columns=list_to_encode)


categorical_df.shape

(1460, 201)


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(categorical_df, df_clean['SalePrice'], test_size=0.2, random_state=42)


x_train.shape

(1168, 201)


x_test.shape

(292, 201)


from sklearn.linear_model import Lasso, Ridge

lasso_model = Lasso(alpha=10)
lasso_model.fit(x_train, y_train)

# Get the selected feature names
selected_feature_names = categorical_df.columns[lasso_model.coef_ != 0]

# Print or use the selected feature names as needed
print("Selected Feature Names:", selected_feature_names)

Selected Feature Names: Index([], dtype='object')


selected_feature_names = selected_feature_names.to_list()


concat_df = pd.concat([principal_df, categorical_df[selected_feature_names]], axis=1)


concat_df.head()


concat_df.shape

(1460, 37)


df_clean
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(concat_df, df_clean['SalePrice'], test_size=0.2, random_state=42)

# Train the model - Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42) 
rf.fit(x_train, y_train)

# Test the model
y_pred = rf.predict(x_test)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)

Mean Absolute Error: 0.10416006535399967
Mean Squared Error: 0.024199680182249703
Root Mean Squared Error (RMSE): 0.15556246392446252
R-squared: 0.8703219337513204


import xgboost as xgb

from numpy import loadtxt
from xgboost import XGBRegressor

# fit model no training data
model = XGBRegressor()
model.fit(x_train, y_train)

# Test the model
y_pred = model.predict(x_test)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)

Mean Absolute Error: 0.10884051148169183
Mean Squared Error: 0.02466610410093325
Root Mean Squared Error (RMSE): 0.15705446221274086
R-squared: 0.8678225225454078


# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBRegressor

# # Define your XGBoost model
# xgb_model = XGBRegressor()

# # Define the hyperparameter grid
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'n_estimators': [50, 100, 200, 300],
#     'max_depth': [3, 5, 7, 9],
#     'min_child_weight': [1, 3, 5, 7],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'gamma': [0, 0.1, 0.2, 0.3],
#     'scale_pos_weight': [1, 2, 3]
# }

# # Create GridSearchCV object
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# # Fit the model to the data
# grid_search.fit(x_train, y_train)

# # Print the best parameters and corresponding RMSE
# print("Best Parameters: ", grid_search.best_params_)
# print("Best RMSE: ", (-grid_search.best_score_) ** 0.5)

# # Get the best model
# best_xgb_model = grid_search.best_estimator_


# fit model no training data
model = XGBRegressor(
    colsample_bytree=0.8,
    gamma=0,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=7,
    n_estimators=300,
    scale_pos_weight=1,
    subsample=0.9
)
model.fit(x_train, y_train)


# Test the model
y_pred = model.predict(x_test)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)


print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)

Mean Absolute Error: 0.10013308511851164
Mean Squared Error: 0.021651133900594074
Root Mean Squared Error (RMSE): 0.14714324279624286
R-squared: 0.8839787486786836


test.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64


test.shape

(1459, 80)


# ExterQual
mapping = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
test['ExterQual'] = test['ExterQual'].map(mapping)
test['ExterCond'] = test['ExterCond'].map(mapping)
test['HeatingQC'] = test['HeatingQC'].map(mapping)
test['KitchenQual'] = test['KitchenQual'].map(mapping)
test['BsmtQual'] = test['BsmtQual'].map(mapping)
test['BsmtCond'] = test['BsmtCond'].map(mapping)
test['FireplaceQu'] = test['FireplaceQu'].map(mapping)
test['PoolQC'] = test['PoolQC'].map(mapping)


mapping_2 = {'Gd': 3,'Av': 2, 'Mn': 1}
test['BsmtExposure'] = test['BsmtExposure'].map(mapping_2)

mapping_3 = {'GLQ': 6,'ALQ': 5, 'BLQ': 4,'Rec': 3,'LwQ': 2, 'Unf': 1}
test['BsmtFinType1'] = test['BsmtFinType1'].map(mapping_3)
test['BsmtFinType2'] = test['BsmtFinType2'].map(mapping_3)

mapping_4 = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
test['GarageQual'] = test['GarageQual'].map(mapping_4)
test['GarageCond'] = test['GarageCond'].map(mapping_4)

mapping_5 = {'N': 0, 'Y': 1}
test['CentralAir'] = test['CentralAir'].map(mapping_5)

mapping_6 = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
test['LotShape'] = test['LotShape'].map(mapping_6)

mapping_7 = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
test['LandSlope'] = test['LandSlope'].map(mapping_7)


test.shape

(1459, 80)


test['LotFrontage'].fillna(test['LotFrontage'].mean(), inplace =True)
test['Alley'].fillna('NA', inplace =True)
test['MasVnrType'].fillna('None', inplace =True)
test['MasVnrArea'].fillna(0, inplace =True)
test['BsmtQual'].fillna(test['BsmtQual'].mean(), inplace =True)
test['BsmtCond'].fillna(test['BsmtCond'].mean(), inplace =True)
test['BsmtExposure'].fillna(test['BsmtExposure'].mean(), inplace =True)
test['BsmtFinType1'].fillna(test['BsmtFinType1'].mean(), inplace =True)
test['BsmtFinType2'].fillna(test['BsmtFinType2'].mean(), inplace =True)
test['Electrical'].fillna('None', inplace =True)
test['FireplaceQu'].fillna(test['FireplaceQu'].mean(), inplace =True)
test['GarageType'].fillna('None', inplace =True)
test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean(), inplace =True)
test['GarageFinish'].fillna('None', inplace =True)
test['GarageQual'].fillna('None', inplace =True)
test['GarageCond'].fillna('None', inplace =True)
test['PoolQC'].fillna(test['PoolQC'].mean(), inplace =True)
test['Fence'].fillna('None', inplace =True)
test['MiscFeature'].fillna('None', inplace =True)
test['MSZoning'].fillna('None', inplace =True)
test['SaleType'].fillna('Oth', inplace =True)
test['Utilities'].fillna('None', inplace =True)
test['Exterior1st'].fillna('Other', inplace =True)
test['Exterior2nd'].fillna('Other', inplace =True)
test['BsmtFinSF1'].fillna(0, inplace =True)
test['BsmtFinSF2'].fillna(0, inplace =True)
test['BsmtUnfSF'].fillna(0, inplace =True)
test['TotalBsmtSF'].fillna(0, inplace =True)
test['BsmtFullBath'].fillna(0, inplace =True)
test['BsmtHalfBath'].fillna(0, inplace =True)
test['KitchenQual'].fillna(test['KitchenQual'].mean(), inplace =True)
test['Functional'].fillna('None', inplace =True)
test['GarageCars'].fillna(0, inplace =True)
test['GarageArea'].fillna(0, inplace =True)


test_na = test.isna().sum()
test_na[test_na > 0]

Series([], dtype: int64)


test_clean = test


test_clean.shape

(1459, 80)


test_clean['GarageYrBlt'].dtype
test_clean['GarageYrBlt'] = 2023 - test_clean['GarageYrBlt']
test_clean['YearBuilt'] = 2023 - test_clean['YearBuilt']
test_clean['YearRemodAdd'] = 2023 - test_clean['YearRemodAdd']
test_clean['YrSold'] = 2023 - test_clean['YrSold']


test_clean['YearBuilt'].head()

0    62
1    65
2    26
3    25
4    31
Name: YearBuilt, dtype: int64


test_clean.rename(columns={'GarageYrBlt': 'GarageAge'}, inplace=True)
test_clean.rename(columns={'YearBuilt': 'YearBuiltAge'}, inplace=True)
test_clean.rename(columns={'YearRemodAdd': 'YearRemodAddAge'}, inplace=True)
test_clean.rename(columns={'YrSold': 'YrSoldAge'}, inplace=True)


# test_clean['Log_GarageAge'] = np.log(test_clean['GarageAge'])
# test_clean['Log_YearBuiltAge'] = np.log(test_clean['YearBuiltAge'])
# test_clean['Log_YearRemodAddAge'] = np.log(test_clean['YearRemodAddAge'])
# test_clean['Log_YrSoldAge'] = np.log(test_clean['YrSoldAge'])

# test_clean.drop(columns=['GarageAge', 'YearBuiltAge', 'YearRemodAddAge', 'YrSoldAge'])


test_clean_na = test_clean.isna().sum()
test_clean_na[test_clean_na > 0]

Series([], dtype: int64)


#test_clean['Log_GarageAge'].fillna(test['Log_GarageAge'].mean(), inplace =True)


test_clean.shape

(1459, 80)


numerical_test_clean = test_clean.select_dtypes(include='number')


numerical_test_clean = numerical_test_clean.drop(columns=['Id'], axis=1)


numerical_test_clean.shape

(1459, 50)


numerical_test_clean.head()


scaled_numerical_test_clean = scale.fit_transform(numerical_test_clean)


test_principal_components = pca.transform(scaled_numerical_test_clean)


test_principal_components.shape

(1459, 37)


columns = [f'PC{i+1}' for i in range(test_principal_components.shape[1])]
test_principal_df = pd.DataFrame(test_principal_components, columns=columns)


test_principal_df.head()


categorical_test_clean = test_clean.select_dtypes(exclude='number')


categorical_test_clean.shape

(1459, 29)


categorical_test = pd.get_dummies(categorical_test_clean, columns=list_to_encode)


# Assuming 'categorical_test' and 'categorical_df' are your DataFrames
missing_columns = set(categorical_df.columns) - set(categorical_test.columns)

# Add missing columns to 'categorical_test' and fill with zeros
for column in missing_columns:
    categorical_test[column] = 0


# Assuming 'categorical_test' and 'categorical_df' are your DataFrames
extra_columns = set(categorical_test.columns) - set(categorical_df.columns)

# Drop extra columns from 'categorical_test'
categorical_test = categorical_test.drop(columns=extra_columns, errors='ignore')


categorical_test.shape

(1459, 201)


concat_test = pd.concat([test_principal_df, categorical_test[selected_feature_names]], axis=1)


concat_test.shape

(1459, 37)


y_pred = rf.predict(concat_test)


y_pred = np.exp(y_pred)


y_pred = pd.DataFrame(y_pred)


test['SalePrice'] = y_pred


submission_4 = test[['Id','SalePrice']]


submission_4.head()


submission_4.to_csv('submission_4.csv', index=False)

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC28	PC29	PC30	PC31	PC32	PC33	PC34	PC35	PC36	PC37
0	2.189841	-0.167633	-1.471123	-2.262940	1.060092	-0.260601	-0.626813	0.068582	-0.784960	-1.201084	...	-0.129178	-0.060662	0.127597	0.328337	-0.118319	0.273451	-0.573404	0.108608	0.079679	0.186511
1	0.045846	-1.543234	1.093235	-0.001000	-0.789579	-1.115314	1.085005	-0.212486	2.294729	0.171581	...	1.353092	0.475002	-0.393422	-0.711873	-0.376811	0.245289	0.599826	-0.520741	0.980782	0.052239
2	2.326964	0.049691	-1.341410	-1.890272	0.213914	-0.145710	0.097192	0.821088	-0.401888	-1.050750	...	0.178433	0.148980	0.628293	0.511766	-0.199613	0.286282	-0.168346	0.049476	-0.383016	-0.092407
3	-0.131933	0.477460	0.602497	-0.474069	-0.403288	-0.902429	-0.835986	-1.001447	-0.538358	0.035704	...	-1.205494	-0.579009	-0.622643	-0.595745	-0.688999	-0.031591	-0.217376	-0.113522	-1.401679	-0.329459
4	4.671188	1.059574	0.364506	-1.838918	0.683339	-0.072945	0.449894	0.733929	0.351705	-0.504662	...	-0.255722	0.251739	-0.116956	0.034451	-0.711869	0.473764	0.147203	0.212041	-0.772847	-0.167668

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC28	PC29	PC30	PC31	PC32	PC33	PC34	PC35	PC36	PC37
0	2.189841	-0.167633	-1.471123	-2.262940	1.060092	-0.260601	-0.626813	0.068582	-0.784960	-1.201084	...	-0.129178	-0.060662	0.127597	0.328337	-0.118319	0.273451	-0.573404	0.108608	0.079679	0.186511
1	0.045846	-1.543234	1.093235	-0.001000	-0.789579	-1.115314	1.085005	-0.212486	2.294729	0.171581	...	1.353092	0.475002	-0.393422	-0.711873	-0.376811	0.245289	0.599826	-0.520741	0.980782	0.052239
2	2.326964	0.049691	-1.341410	-1.890272	0.213914	-0.145710	0.097192	0.821088	-0.401888	-1.050750	...	0.178433	0.148980	0.628293	0.511766	-0.199613	0.286282	-0.168346	0.049476	-0.383016	-0.092407
3	-0.131933	0.477460	0.602497	-0.474069	-0.403288	-0.902429	-0.835986	-1.001447	-0.538358	0.035704	...	-1.205494	-0.579009	-0.622643	-0.595745	-0.688999	-0.031591	-0.217376	-0.113522	-1.401679	-0.329459
4	4.671188	1.059574	0.364506	-1.838918	0.683339	-0.072945	0.449894	0.733929	0.351705	-0.504662	...	-0.255722	0.251739	-0.116956	0.034451	-0.711869	0.473764	0.147203	0.212041	-0.772847	-0.167668

	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	...	PC28	PC29	PC30	PC31	PC32	PC33	PC34	PC35	PC36	PC37
0	-2.839040	-1.470161	0.664487	0.779536	-0.663260	0.444999	0.169294	1.049017	-0.625156	-0.273826	...	-0.310042	0.407158	-0.646351	-0.558659	0.486072	0.333655	-0.313075	0.456774	0.923647	-0.150863
1	-1.133623	-0.612153	2.279086	-0.536540	-0.587211	-1.377610	0.556120	2.859048	3.248475	-2.617143	...	-0.902030	-0.339549	-0.888399	0.741832	-0.399486	0.146729	-0.379392	1.920819	-0.020897	-0.402737
2	0.759764	-0.246478	0.006912	-1.955643	0.838348	0.437214	1.329624	0.224668	0.030200	-2.362899	...	0.261019	-0.570169	-0.114005	-0.428628	0.376088	0.006565	0.077427	-0.774392	0.974675	-1.026337
3	1.445369	0.251309	-0.348032	-1.589184	-0.202136	-0.480743	0.754694	-0.509421	-0.076047	-1.568251	...	0.013802	-0.099050	-0.044536	-0.814137	-0.280436	-0.618989	0.063194	-0.240648	0.348658	-1.909528
4	1.272475	-0.990319	-2.566121	0.613926	-0.023332	0.447076	-0.555119	0.019079	-1.038859	0.746275	...	-0.969416	-0.122335	-0.828462	0.188991	-0.495817	-0.240772	0.481708	1.247824	0.415142	-0.639205

	Id	SalePrice
0	1461	123104.018362
1	1462	164577.118041
2	1463	182534.158556
3	1464	201009.454464
4	1465	182511.697001

Data Exploration and Cleaning¶

Transform Hierarchy Categorical Data¶

Deal With Missing Value¶

LotFrontage¶

Alley¶

MasVnrType¶

MasVnrArea¶

BsmtQual¶

BsmtCond¶

BsmtExposure¶

BsmtFinType1¶

BsmtFinType2¶

Electrical¶

FireplaceQu¶

Garage¶

PoolQC¶

Fence¶

MiscFeature¶

Transform to yearold¶

Conduct PCA to Reduce Dimention for Numerical Variables¶

Encode Categorical Variables¶

Conduct Lasso to do feature selection¶

Combine numerical data and categorical data¶

Prepare Test Data¶

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

	MSSubClass	LotFrontage	LotArea	LotShape	LandSlope	OverallQual	OverallCond	YearBuiltAge	YearRemodAddAge	MasVnrArea	...	WoodDeckSF	OpenPorchSF	EnclosedPorch	PoolQC	MoSold	YrSoldAge
0	60	65.0	8450	4	3	7	5	20	20	196.0	...	0	61	0	3.714286	2	15
1	20	80.0	9600	4	3	6	8	47	47	0.0	...	298	0	0	3.714286	5	16
2	60	68.0	11250	3	3	7	5	22	21	162.0	...	0	42	0	3.714286	9	15
3	70	60.0	9550	3	3	7	5	108	53	0.0	...	0	35	272	3.714286	2	17
4	60	84.0	14260	3	3	8	5	23	23	350.0	...	192	84	0	3.714286	12	15

	MSZoning	Street	Alley	LandContour	Utilities	LotConfig	Neighborhood	Condition1	Condition2	BldgType	...	Functional	GarageType	GarageFinish	GarageQual	GarageCond	PavedDrive	Fence	MiscFeature	SaleType	SaleCondition
0	RL	Pave	NA	Lvl	AllPub	Inside	CollgCr	Norm	Norm	1Fam	...	Typ	Attchd	RFn	3.0	3.0	Y	None	None	WD	Normal
1	RL	Pave	NA	Lvl	AllPub	FR2	Veenker	Feedr	Norm	1Fam	...	Typ	Attchd	RFn	3.0	3.0	Y	None	None	WD	Normal
2	RL	Pave	NA	Lvl	AllPub	Inside	CollgCr	Norm	Norm	1Fam	...	Typ	Attchd	RFn	3.0	3.0	Y	None	None	WD	Normal
3	RL	Pave	NA	Lvl	AllPub	Corner	Crawfor	Norm	Norm	1Fam	...	Typ	Detchd	Unf	3.0	3.0	Y	None	None	WD	Abnorml
4	RL	Pave	NA	Lvl	AllPub	FR2	NoRidge	Norm	Norm	1Fam	...	Typ	Attchd	RFn	3.0	3.0	Y	None	None	WD	Normal

	MSSubClass	LotFrontage	LotArea	LotShape	LandSlope	OverallQual	OverallCond	YearBuiltAge	YearRemodAddAge	MasVnrArea	...	WoodDeckSF	OpenPorchSF	ScreenPorch	PoolQC	MiscVal	MoSold	YrSoldAge
0	20	80.0	11622	4	3	5	6	62	62	0.0	...	140	0	120	4.666667	0	6	13
1	20	81.0	14267	3	3	6	6	65	65	108.0	...	393	36	0	4.666667	12500	6	13
2	60	74.0	13830	3	3	5	5	26	25	0.0	...	212	34	0	4.666667	0	3	13
3	60	78.0	9978	3	3	6	6	25	25	20.0	...	360	36	0	4.666667	0	6	13
4	120	43.0	5005	3	3	8	5	31	31	0.0	...	0	82	144	4.666667	0	1	13